The captions in the graph can represent the flow of data exploration. # Import and name variables and factors properly

library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ dplyr   1.0.7
## ✔ tibble  3.1.8     ✔ stringr 1.4.0
## ✔ tidyr   1.1.4     ✔ forcats 0.5.1
## ✔ purrr   0.3.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
SeedMethod <- read_csv("/Users/panareeboonyuen/Thai-rice-open-data-Github/data/raw-data/MethodSeedDensityAnalysis.csv")
## Rows: 894 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): province ENG, method
## dbl (9): year, ID_01, planting area(rai), harvesting area (rai), production ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(SeedMethod)[11]<- "SeedUtilise"
names(SeedMethod)[9]<- "YieldHA"
names(SeedMethod)[8]<- "YieldPA"
names(SeedMethod)[3]<- "Province"
#Change "PlacingMachine" to a better term "SowingMachine"
SeedMethod$method[SeedMethod$method == "PlacingMachine"] <- "DirectSow"

Add region

region_provincelist <- read_csv("data/raw-data/region-provincellsit.csv",col_names = F)
## Rows: 6 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): X1, X2
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
NorthEast<- region_provincelist[1,2]
North <- region_provincelist[2,2]
South <- region_provincelist[3,2]
Central <- region_provincelist[4,2]
East <- region_provincelist[5,2]
West <- region_provincelist[6,2]
library(stringr)
FormProvinceStr <- function(Listcomma) {
  Listcomma <- str_replace_all(Listcomma, " ", "")
  Listcomma2 <- str_split(Listcomma, ",", simplify = TRUE)
  return(Listcomma2)
}

#use with each list
North <- FormProvinceStr(North)
NorthEast <- FormProvinceStr(NorthEast)
South <- FormProvinceStr(South)
Central <- FormProvinceStr(Central)
East <- FormProvinceStr(East)
West <- FormProvinceStr(West)

SeedMethod$Province<-str_replace_all(SeedMethod$Province, " ", "")

provinceList <- c(NorthEast, North, South, Central, East, West)
provinceList <-
  provinceList[order(provinceList)] #order alphabetically
#length(provinceList) == 77 checked

SeedMethod<-SeedMethod %>% 
  mutate(
    region = case_when(
      match(Province,NorthEast)>0 ~ "NorthEast",
      match(Province,North)>0 ~ "North",
      match(Province,South)>0 ~ "South",
      match(Province,Central)>0 ~ "Central",
      match(Province,East)>0 ~ "East",
      match(Province,West)>0 ~ "West",
      TRUE ~ "CheckSpell"
    )
  )

Regional trend

SeedMethod %>%
  ggplot() +
  geom_point(mapping = aes(x=SeedUtilise,y=YieldHA, col = method), size = 1)+
  labs(x=" Seed Utilisation rate (Kg/rai)", y = "Yield per Planting Area (Kg/rai)", caption = "#North have two types of responses of yields to seed use increase, for Dry and Wet Broadcast,\nwhile other regions had consistent trends (positive correlation)\n#NE was not competitive at all for Dry Broadcast and Wet Broadcast\n(but this may be because of low-yielding KDML105 was planted the most in NE)\n#No Sowing machine method for Central and East seem fishy
#No other management and cultivar information! - the problem of confounding factor
")+
  facet_wrap(~(region))

Look at broadcasting - Northern

SeedMethod %>% filter(region == "North") %>% filter(method == "DryBroadcast")  %>% 
  ggplot() +
  geom_point(mapping = aes(x=SeedUtilise,y=YieldHA, col = Province))+
  labs(x=" Seed Utilisation rate (Kg/rai)", y = "Yield (Kg/rai)", title = "Northern province level - DryBroadcast")+
  scale_y_continuous(limits = c(300,750))

SeedMethod %>% filter(region == "North") %>% filter(method == "WetBroadcast")  %>% 
  ggplot() +
  geom_point(mapping = aes(x=SeedUtilise,y=YieldHA, col = Province))+
  labs(x=" Seed Utilisation rate (Kg/rai)", y = "Yield (Kg/rai)", title = "Northern province level - WetBroadcast")+
  scale_y_continuous(limits = c(300,750))

cat("Two schools of thought for how much seed they should use
Use low seed amount = ChaingMai, ChaingRai, Phrae, LamPang, LamPhun, Phayao
Use high seed amount = Uttaradit, kamphaengPhet, Phetchabun, Phichit, Phitsanulok, Sukhothai\n")
## Two schools of thought for how much seed they should use
## Use low seed amount = ChaingMai, ChaingRai, Phrae, LamPang, LamPhun, Phayao
## Use high seed amount = Uttaradit, kamphaengPhet, Phetchabun, Phichit, Phitsanulok, Sukhothai
cat("Questions:
1. What forces high-seed provinces to use a higher seed amount, which is a waste of money 
2. What if they use lower seed?
3. What cause high variation in low-seed use provinces?
4. What make CM having so high yield")
## Questions:
## 1. What forces high-seed provinces to use a higher seed amount, which is a waste of money 
## 2. What if they use lower seed?
## 3. What cause high variation in low-seed use provinces?
## 4. What make CM having so high yield

look at provincial level

SeedMethod %>%
  ggplot() +
  geom_point(mapping = aes(x=SeedUtilise,y=YieldPA, col = method))+
  labs(x=" Seed Utilisation rate (Kg/rai)", y = "Yield per Planting Area (Kg/rai)")+
  facet_wrap(~Province)

#we can see that there are too few data points to analyse each of the provinces 
ggplot(data = SeedMethod) +
  geom_bar(mapping=aes(x=Province, fill = method), orientation = "x")+
  theme(axis.text.x =element_text(angle = 90, size = 5))

#filter only high count province
ObsCount<-SeedMethod %>%  group_by(Province) %>% summarise(n = n())
HighObsCount <- ObsCount  %>% filter(n>=15)
#There are 17 provinces with equal or more than 15 obeservations
SeedMethod %>% filter(match(Province,HighObsCount$Province)>0) %>%
  ggplot() +
  geom_point(mapping = aes(x=SeedUtilise,y=YieldHA, col = method))+
  labs(x=" Seed Utilisation rate (Kg/rai)", y = "Yield per Planting Area (Kg/rai)", caption = "The distribution of data points are so similar;\neach of the four corners are represented by the same methods\nBut we cannot just recommend the whole country to do transplanting\nas we do not have other information e.g. cultivar and irrigation status")+
  facet_wrap(~Province)

Comoparinng

From DSSAT, simulation shows that Direct sowing (DS) give lower, more spread yields, while transplanting (TP) give higher, less spread yield. We will check that with the real data from OAE

ggplot(SeedMethod) +
  geom_histogram(mapping = aes(YieldHA, fill = region), position = "stack")+
  facet_wrap(~method)+
  labs(caption = "From the whole-country look, 
Mean yield from DS fields is indeed smaller than that of TP DS has a narrower spread than TP. 
However, them DS data come from only 4 regions (no East and Central),\nwhile TP data come from 6, so we will plot them by region. ", x = "Yield (kg/rai)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Region considered
ggplot(SeedMethod %>% filter(method %in% c("DirectSow", "Transplant"))) +
  geom_histogram(mapping = aes(YieldHA, fill = region), position = "identity")+
  facet_grid(method~region)+
  theme(axis.text.x = element_text(angle = 90, size = 5))+
  labs(x = "Yield (kg/rai)", caption = "Comparing the histograms of DS and TP of the same region,  
  TP created a wider spread of yields. 
Mean yield shows the sample expected trends
")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#get stat
SeedMethod %>% group_by(method, region)  %>% summarise(n = n(), meanY = round(mean(YieldHA),digits = 2), sdY = round(sd(YieldHA),digits = 2), CVY = round(mean(YieldHA)/sd(YieldHA), digits = 2)) %>% filter(method %in% c("DirectSow", "Transplant")) 
## `summarise()` has grouped output by 'method'. You can override using the
## `.groups` argument.
## # A tibble: 10 × 6
## # Groups:   method [2]
##    method     region        n meanY   sdY   CVY
##    <chr>      <chr>     <int> <dbl> <dbl> <dbl>
##  1 DirectSow  North        39  404.  67.2  6.01
##  2 DirectSow  NorthEast    34  344.  29.4 11.7 
##  3 DirectSow  South        24  324.  32.7  9.92
##  4 DirectSow  West          8  392.  84.6  4.63
##  5 Transplant Central      44  676.  50.1 13.5 
##  6 Transplant East         32  486. 107.   4.55
##  7 Transplant North        56  583.  49.9 11.7 
##  8 Transplant NorthEast    80  369.  24.6 15.0 
##  9 Transplant South        54  393.  59.3  6.63
## 10 Transplant West         20  609. 105.   5.8
# Seed utilisation considered
ggplot(SeedMethod %>% filter(method %in% c("DirectSow", "Transplant")) )+
  geom_point(aes(x = SeedUtilise, y = YieldHA, col = method))+
  labs(x = "Seed Utilisation Rate (kg/rai)", y = "Yield (kg/rai)")+
  labs(caption = "In addition to region, another available factor that affect yields is seed utilisation rate. 
       Its range for TP is actually greater than that for DS at country-level, 
       possibly creating a wider spread or yield. ")

#produce group , suing 8 and 14 as division
lowseed <- SeedMethod %>% group_by(method,region) %>% filter(SeedUtilise <= 8,method %in% c("DirectSow", "Transplant")) 
lowseed$group <- rep("low", nrow(lowseed))
highseed <- SeedMethod %>% group_by(method,region) %>% filter(SeedUtilise > 8, SeedUtilise <= 14,method %in% c("DirectSow", "Transplant"))
highseed$group <- rep("high", nrow(highseed))
seedu2met <- rbind(lowseed, highseed)

ggplot(seedu2met)+
  geom_histogram(aes(x=YieldHA, fill = method), position = "identity", alpha = 0.5)+
  facet_grid(region~group)+
  labs(caption = "Mean yield show the consistent expected trends: 
  higher for TP for all region and all seed utilisation group  
  <still confounding = management and cultivar and weather>")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

seedu2met %>% group_by(group, region, method) %>% summarise(n = n(), meanY = mean(YieldHA), sdY = sd(YieldHA), CVY = mean(YieldHA)/sd(YieldHA))  %>% filter(method %in% c("DirectSow", "Transplant"), region %in% c("North", "NorthEast", "South", "West")) %>% ggplot()+
  geom_col(aes(x= group , y = CVY, fill = method), position = "dodge")+
  facet_wrap(~region, ncol = 4)+
  labs(caption = "For spread, CV of TP (expected to be lower) is actually lower than that of DS 
  in 3 out of 6 cases: in south for both seed use rates, 
  and in west when the seed use rate in between 8-14 kr/rai (high)
")
## `summarise()` has grouped output by 'group', 'region'. You can override using
## the `.groups` argument.

#Regression modelling * We will try using the data from Transplant (TP) and DirectSow (DS) fields to find a regression model of yield. Yield = response variable, and 1) method, 2) seed utilisation, and 3) region will be candidate explanatory variables. * Note that there are MAJOR confounding factors (variables that can affect yield that have not been controlled/known) such as varieties of rice, management (irrigation, fertilisation etc). * However, we will attempt forming the model, because at least, the trend of yields are monotonically increasing (more so for TP) with respect to seed utilisation rate for TP and DS fields. We will only use data from 4 regions (“North”, “NorthEast”, “South”, “West”) because the other two regions have no records of DS fields

#correlation between seed utilisation rate and yield
mono_df <-
  SeedMethod %>% filter(method %in% c("Transplant", "DirectSow"), region %in% c("North", "NorthEast", "South", "West"))

mono_df %>%
  ggplot() +
  geom_point(mapping = aes(x = SeedUtilise, y = YieldHA, col = method))

mod <- lm(YieldHA ~ method * SeedUtilise * region, data = mono_df)
coef(mod)
##                                  (Intercept) 
##                                    639.04245 
##                             methodTransplant 
##                                    -41.88269 
##                                  SeedUtilise 
##                                    -23.05364 
##                              regionNorthEast 
##                                   -335.89031 
##                                  regionSouth 
##                                   -346.49323 
##                                   regionWest 
##                                    131.16428 
##                 methodTransplant:SeedUtilise 
##                                     21.79267 
##             methodTransplant:regionNorthEast 
##                                     30.89769 
##                 methodTransplant:regionSouth 
##                                     83.74198 
##                  methodTransplant:regionWest 
##                                   -482.46376 
##                  SeedUtilise:regionNorthEast 
##                                     27.67144 
##                      SeedUtilise:regionSouth 
##                                     27.88834 
##                       SeedUtilise:regionWest 
##                                    -17.08233 
## methodTransplant:SeedUtilise:regionNorthEast 
##                                    -16.52435 
##     methodTransplant:SeedUtilise:regionSouth 
##                                    -19.49284 
##      methodTransplant:SeedUtilise:regionWest 
##                                     49.76579
par(mfrow = c(2, 2))
plot(mod) #residuals do not seem to spread randomly

#drop1(mod, test = "F")
m1 <- lm(YieldHA ~ method + SeedUtilise, data = mono_df)
coef(m1)
##      (Intercept) methodTransplant      SeedUtilise 
##        132.56982         80.13114         26.28754
par(mfrow = c(2, 2))
plot(m1)

m2 <- lm(YieldHA ~ method + SeedUtilise + region, data = mono_df)
coef(m2)
##      (Intercept) methodTransplant      SeedUtilise  regionNorthEast 
##       357.887466        97.203935         8.608792      -134.144122 
##      regionSouth       regionWest 
##      -119.691190        25.344632
par(mfrow = c(2, 2))
plot(m2)

mono_df$fittedYieldfull <- fitted(mod)
mono_df$fittedYieldMS <- fitted(m1)
mono_df$fittedYieldMSR <- fitted(m2)
colmodel <-
  c("M*S*R" = "red",
    "M+S+R" = "darkgreen",
    "M+S" = "blue")
ggplot(mono_df) +
  #geom_point(aes(x = SeedUtilise, y = YieldHA, col = method), shape = 2)+
  geom_point(aes(x = SeedUtilise, y = YieldHA, col = method), shape = 2) +
  geom_point(aes(x = SeedUtilise, y = fittedYieldfull, col = "M*S*R"), size = 1) +
  geom_point(aes(x = SeedUtilise, y = fittedYieldMS, col = "M+S"), size = 1) +
  geom_point(aes(x = SeedUtilise, y = fittedYieldMSR, col = "M+S+R"), size = 1) +
  labs(x = "Seed Utilisation Rate (kg/rai)", y = "Yield (kg/rai)") +
  scale_color_manual(values = colmodel) +
  labs(title = "Fitted values of Yield from 3 models", color = "Model factors") +
  #facet_wrap(~region)
  facet_wrap( ~ method)

#facet_grid(method~region)

#compare models
#1. using anova
mod <- lm(YieldHA ~ method * SeedUtilise * region, data = mono_df)
m1 <- lm(YieldHA ~ method + SeedUtilise, data = mono_df)
m2 <- lm(YieldHA ~ method + SeedUtilise + region, data = mono_df)
anova(mod, m1, m2)
## Analysis of Variance Table
## 
## Model 1: YieldHA ~ method * SeedUtilise * region
## Model 2: YieldHA ~ method + SeedUtilise
## Model 3: YieldHA ~ method + SeedUtilise + region
##   Res.Df     RSS  Df Sum of Sq       F    Pr(>F)    
## 1    299  650915                                    
## 2    312 1977556 -13  -1326641  46.877 < 2.2e-16 ***
## 3    309 1151450   3    826106 126.492 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# F is high -> reject null that smaller model is better, so this means mod is better and the interactions between factors are significant (can help explain the variation in yield)

#2. also try using AIC - stepwise regression: start with no predictors, then sequentially add the most contributive predictors (like forward selection). After adding each new variable, remove any variables that no longer provide an improvement in the model fit (like backward selection).
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
step.model <- stepAIC(mod, direction = "both",
                      trace = FALSE)
summary(step.model)
## 
## Call:
## lm(formula = YieldHA ~ method * SeedUtilise * region, data = mono_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -142.005  -25.834   -1.738   21.740  143.289 
## 
## Coefficients:
##                                              Estimate Std. Error t value
## (Intercept)                                   639.042     55.790  11.454
## methodTransplant                              -41.883     80.552  -0.520
## SeedUtilise                                   -23.054      5.416  -4.256
## regionNorthEast                              -335.890     70.034  -4.796
## regionSouth                                  -346.493     73.224  -4.732
## regionWest                                    131.164    100.847   1.301
## methodTransplant:SeedUtilise                   21.793      7.409   2.941
## methodTransplant:regionNorthEast               30.898     97.788   0.316
## methodTransplant:regionSouth                   83.742     95.687   0.875
## methodTransplant:regionWest                  -482.464    130.398  -3.700
## SeedUtilise:regionNorthEast                    27.671      7.200   3.843
## SeedUtilise:regionSouth                        27.888      8.904   3.132
## SeedUtilise:regionWest                        -17.082     10.281  -1.662
## methodTransplant:SeedUtilise:regionNorthEast  -16.524      9.915  -1.667
## methodTransplant:SeedUtilise:regionSouth      -19.493     10.508  -1.855
## methodTransplant:SeedUtilise:regionWest        49.766     12.504   3.980
##                                              Pr(>|t|)    
## (Intercept)                                   < 2e-16 ***
## methodTransplant                             0.603488    
## SeedUtilise                                  2.78e-05 ***
## regionNorthEast                              2.56e-06 ***
## regionSouth                                  3.44e-06 ***
## regionWest                                   0.194386    
## methodTransplant:SeedUtilise                 0.003522 ** 
## methodTransplant:regionNorthEast             0.752249    
## methodTransplant:regionSouth                 0.382186    
## methodTransplant:regionWest                  0.000257 ***
## SeedUtilise:regionNorthEast                  0.000148 ***
## SeedUtilise:regionSouth                      0.001907 ** 
## SeedUtilise:regionWest                       0.097656 .  
## methodTransplant:SeedUtilise:regionNorthEast 0.096629 .  
## methodTransplant:SeedUtilise:regionSouth     0.064562 .  
## methodTransplant:SeedUtilise:regionWest      8.65e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.66 on 299 degrees of freedom
## Multiple R-squared:  0.8254, Adjusted R-squared:  0.8166 
## F-statistic: 94.23 on 15 and 299 DF,  p-value: < 2.2e-16
as.data.frame(coef(step.model))
##                                              coef(step.model)
## (Intercept)                                         639.04245
## methodTransplant                                    -41.88269
## SeedUtilise                                         -23.05364
## regionNorthEast                                    -335.89031
## regionSouth                                        -346.49323
## regionWest                                          131.16428
## methodTransplant:SeedUtilise                         21.79267
## methodTransplant:regionNorthEast                     30.89769
## methodTransplant:regionSouth                         83.74198
## methodTransplant:regionWest                        -482.46376
## SeedUtilise:regionNorthEast                          27.67144
## SeedUtilise:regionSouth                              27.88834
## SeedUtilise:regionWest                              -17.08233
## methodTransplant:SeedUtilise:regionNorthEast        -16.52435
## methodTransplant:SeedUtilise:regionSouth            -19.49284
## methodTransplant:SeedUtilise:regionWest              49.76579
#3. using caret, with Cross validation
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
train.control <- trainControl(method = "cv", number = 10)
step.model.ca <-
  train(
    YieldHA ~ method * SeedUtilise * region,
    data = mono_df,
    method = "lmStepAIC",
    trControl = train.control,
    trace = FALSE
  )
# Final model coefficients
step.model.ca$finalModel
## 
## Call:
## lm(formula = .outcome ~ SeedUtilise + regionNorthEast + regionSouth + 
##     regionWest + `methodTransplant:SeedUtilise` + `methodTransplant:regionWest` + 
##     `SeedUtilise:regionNorthEast` + `SeedUtilise:regionSouth` + 
##     `SeedUtilise:regionWest` + `methodTransplant:SeedUtilise:regionNorthEast` + 
##     `methodTransplant:SeedUtilise:regionSouth` + `methodTransplant:SeedUtilise:regionWest`, 
##     data = dat)
## 
## Coefficients:
##                                    (Intercept)  
##                                        618.952  
##                                    SeedUtilise  
##                                        -21.121  
##                                regionNorthEast  
##                                       -322.205  
##                                    regionSouth  
##                                       -291.104  
##                                     regionWest  
##                                        151.255  
##                 `methodTransplant:SeedUtilise`  
##                                         17.975  
##                  `methodTransplant:regionWest`  
##                                       -524.346  
##                  `SeedUtilise:regionNorthEast`  
##                                         26.443  
##                      `SeedUtilise:regionSouth`  
##                                         20.803  
##                       `SeedUtilise:regionWest`  
##                                        -19.015  
## `methodTransplant:SeedUtilise:regionNorthEast`  
##                                        -13.990  
##     `methodTransplant:SeedUtilise:regionSouth`  
##                                         -9.802  
##      `methodTransplant:SeedUtilise:regionWest`  
##                                         53.584
# Summary of the model
summary(step.model.ca$finalModel)
## 
## Call:
## lm(formula = .outcome ~ SeedUtilise + regionNorthEast + regionSouth + 
##     regionWest + `methodTransplant:SeedUtilise` + `methodTransplant:regionWest` + 
##     `SeedUtilise:regionNorthEast` + `SeedUtilise:regionSouth` + 
##     `SeedUtilise:regionWest` + `methodTransplant:SeedUtilise:regionNorthEast` + 
##     `methodTransplant:SeedUtilise:regionSouth` + `methodTransplant:SeedUtilise:regionWest`, 
##     data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -139.441  -25.575   -1.577   22.348  143.289 
## 
## Coefficients:
##                                                 Estimate Std. Error t value
## (Intercept)                                     618.9521    40.1072  15.432
## SeedUtilise                                     -21.1208     3.9258  -5.380
## regionNorthEast                                -322.2050    48.4849  -6.645
## regionSouth                                    -291.1042    44.2578  -6.577
## regionWest                                      151.2547    92.8364   1.629
## `methodTransplant:SeedUtilise`                   17.9748     0.9831  18.284
## `methodTransplant:regionWest`                  -524.3465   102.1962  -5.131
## `SeedUtilise:regionNorthEast`                    26.4434     5.0197   5.268
## `SeedUtilise:regionSouth`                        20.8027     4.9874   4.171
## `SeedUtilise:regionWest`                        -19.0152     9.5532  -1.990
## `methodTransplant:SeedUtilise:regionNorthEast`  -13.9899     1.5503  -9.024
## `methodTransplant:SeedUtilise:regionSouth`       -9.8022     1.9901  -4.926
## `methodTransplant:SeedUtilise:regionWest`        53.5837    10.0863   5.312
##                                                Pr(>|t|)    
## (Intercept)                                     < 2e-16 ***
## SeedUtilise                                    1.50e-07 ***
## regionNorthEast                                1.41e-10 ***
## regionSouth                                    2.11e-10 ***
## regionWest                                       0.1043    
## `methodTransplant:SeedUtilise`                  < 2e-16 ***
## `methodTransplant:regionWest`                  5.17e-07 ***
## `SeedUtilise:regionNorthEast`                  2.63e-07 ***
## `SeedUtilise:regionSouth`                      3.97e-05 ***
## `SeedUtilise:regionWest`                         0.0474 *  
## `methodTransplant:SeedUtilise:regionNorthEast`  < 2e-16 ***
## `methodTransplant:SeedUtilise:regionSouth`     1.39e-06 ***
## `methodTransplant:SeedUtilise:regionWest`      2.10e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.5 on 302 degrees of freedom
## Multiple R-squared:  0.8248, Adjusted R-squared:  0.8179 
## F-statistic: 118.5 on 12 and 302 DF,  p-value: < 2.2e-16
#compare the result of coefficient from
as.data.frame(coef(mod))
##                                               coef(mod)
## (Intercept)                                   639.04245
## methodTransplant                              -41.88269
## SeedUtilise                                   -23.05364
## regionNorthEast                              -335.89031
## regionSouth                                  -346.49323
## regionWest                                    131.16428
## methodTransplant:SeedUtilise                   21.79267
## methodTransplant:regionNorthEast               30.89769
## methodTransplant:regionSouth                   83.74198
## methodTransplant:regionWest                  -482.46376
## SeedUtilise:regionNorthEast                    27.67144
## SeedUtilise:regionSouth                        27.88834
## SeedUtilise:regionWest                        -17.08233
## methodTransplant:SeedUtilise:regionNorthEast  -16.52435
## methodTransplant:SeedUtilise:regionSouth      -19.49284
## methodTransplant:SeedUtilise:regionWest        49.76579
as.data.frame(coef(step.model)) #same as above as no recalculation of the coef 
##                                              coef(step.model)
## (Intercept)                                         639.04245
## methodTransplant                                    -41.88269
## SeedUtilise                                         -23.05364
## regionNorthEast                                    -335.89031
## regionSouth                                        -346.49323
## regionWest                                          131.16428
## methodTransplant:SeedUtilise                         21.79267
## methodTransplant:regionNorthEast                     30.89769
## methodTransplant:regionSouth                         83.74198
## methodTransplant:regionWest                        -482.46376
## SeedUtilise:regionNorthEast                          27.67144
## SeedUtilise:regionSouth                              27.88834
## SeedUtilise:regionWest                              -17.08233
## methodTransplant:SeedUtilise:regionNorthEast        -16.52435
## methodTransplant:SeedUtilise:regionSouth            -19.49284
## methodTransplant:SeedUtilise:regionWest              49.76579
as.data.frame(step.model.ca$finalModel[["coefficients"]])
##                                                step.model.ca$finalModel[["coefficients"]]
## (Intercept)                                                                    618.952053
## SeedUtilise                                                                    -21.120778
## regionNorthEast                                                               -322.205049
## regionSouth                                                                   -291.104237
## regionWest                                                                     151.254676
## `methodTransplant:SeedUtilise`                                                  17.974759
## `methodTransplant:regionWest`                                                 -524.346453
## `SeedUtilise:regionNorthEast`                                                   26.443400
## `SeedUtilise:regionSouth`                                                       20.802718
## `SeedUtilise:regionWest`                                                       -19.015196
## `methodTransplant:SeedUtilise:regionNorthEast`                                 -13.989920
## `methodTransplant:SeedUtilise:regionSouth`                                      -9.802151
## `methodTransplant:SeedUtilise:regionWest`                                       53.583704
# Model accuracy
step.model.ca$results
##   parameter     RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 1      none 47.35933 0.8136577 35.78955 5.027525 0.03911718 2.949323
#using content from http://www.sthda.com/english/articles/37-model-selection-essentials-in-r/154-stepwise-regression-essentials-in-r/


testdataf <- data.frame(region = rep(c( "North"  ,   "NorthEast", "South"  ,   "West"), 3),SeedUtilise = c(rep(4,4),rep(8,4),rep(12,4)))
testdataf <- rbind(testdataf,testdataf)
testdataf$method <- c(rep("DirectSow",12) , rep("Transplant",12)) 
predict(mod, newdata = testdataf)
##        1        2        3        4        5        6        7        8 
## 546.8279 321.6233 311.8880 609.6628 454.6133 340.0945 331.2268 449.1189 
##        9       10       11       12       13       14       15       16 
## 362.3987 358.5657 350.5656 288.5750 592.1159 331.7116 362.9466 371.5502 
##       17       18       19       20       21       22       23       24 
## 587.0720 371.2561 391.4847 497.2402 582.0281 410.8006 420.0229 622.9301
testdataf$fitted <- predict(mod, newdata = testdataf)
ggplot(testdataf)+
  geom_col(aes(y=fitted,x = SeedUtilise, fill = method), position = "dodge")+
  facet_wrap(~region, nrow = 1)+
  scale_x_continuous(breaks = c(4,8,12))+
  labs(x= "Seed use rate (kg/rai)", y = "fitted yield from the best model (kg/rai)", caption = "In NE, the effects on yields of TP and DS are the most similar. 
       The increasing yield with seed use rates are also seen with both planting types in the South ans in NE. 
       However, in N, it seems that the higher seed use rate caused the lower yield. 
       Lastly, in the West, only for TP that the increasing trends with seed use rate is seen. 
       Therefore, we will look further into these 2 regions to understand these unusal results.")

#Plot of North
ggplot(SeedMethod %>% filter(region == "North", method %in% c("Transplant","DirectSow")))+
  geom_point(aes(y=YieldHA,x = SeedUtilise, col = Province))+
  facet_wrap(~method)+
  labs(x = "Seed use rate (kg/rai)", y = "Recorded Yield (kg/rai)", title = "North", caption = "The raw data do not contain seed use rate = 4 kg/rai for DS and TP, and no 8 kg/rai for TP, 
  it would have been an extrapolation at regional level when we examplified the previous graph. 
       For DS, only Lamphun and ChiangMai truly showed downward yield trends with increasing seed use. 
       For TP, it is hard to notice the trends. 
  However, one trend is that data points from the same provinces cluster quite close together")

#Plot of West
ggplot(SeedMethod %>% filter(region == "West", method %in% c("Transplant","DirectSow")))+
  geom_point(aes(y=YieldHA,x = SeedUtilise, col = Province))+
  facet_wrap(~method)+
  labs(x = "Seed use rate (kg/rai)", y = "Recorded Yield (kg/rai)", title = "West",caption = "For DS, data were only from two provinces,
       but, again, too few data points per province to further the investigation 
       and data points from the same provinces cluster quite close together ")

Summary: The best model is when using all three factors including the interaction terms, according to stepwise regression using AIC with 10-fold cross validation (Yield ~ method * SeedUtilise * region). The example new input data set shows in From the performance metric table, on average the model is off by 36-47 kg/rai. The unrealistic downward trends of yield with increasing seed use in N (for both planting methods) and W (for direct sowing) can be due to 1) extrapolation, 2) province-specific effect and we cannot investigate further because 1) too few data points for each province, 2) no other management variables given for each province.